# Derive calendar features from the Date column as integers:
# two-digit year (%y) and day of month (%d). The month feature for
# train appears to be created earlier in the file -- TODO confirm.
train$year <- as.integer(format(as.Date(train$Date), "%y"))
train$day <- as.integer(format(as.Date(train$Date), "%d"))
head(train)
# Same features for the test set (month, year, day).
test$month <- as.integer(format(as.Date(test$Date), "%m"))
test$year <- as.integer(format(as.Date(test$Date), "%y"))
test$day <- as.integer(format(as.Date(test$Date), "%d"))
head(test)
# Inspect which dates the test set covers.
sort(unique(test$Date))
print(sort(unique(test$Date)))
cat("Что же получилось в итоге?")
summary(train)
# Distribution of the target (Sales) as a frequency plot.
plot(table(train$Sales))
###############################################################################
cat("Визуализация")
library(forecast)
# Total Sales per (month, year); the "." on the RHS expands to the
# remaining columns month and year.
agg<-aggregate(Sales ~ ., data=train[, c("Sales", "month" ,"year")], FUN=sum)
# NOTE(review): this assignment shadows base::ts with a ts object.
# Monthly series (in millions) starting January 2013.
ts<-ts(agg$Sales/1000000, start=2013, frequency=12)
col = rainbow(3)
seasonplot(ts, col=col, year.labels.left = TRUE, pch=19, las=1, main = "Продажи по годам", ylab = "Суммарные продажи(млн)")
# Mean monthly total per year; [-3,] drops the third row, presumably
# the incomplete 2015 year -- TODO confirm.
meanAnnual<-aggregate(Sales ~ ., data=agg[, c("Sales" ,"year")], FUN=mean)[-3,]
# Horizontal reference lines: yearly means (column 2 is Sales).
abline(h=meanAnnual[1,2]/1000000, col=col[1], lty=2)
abline(h=meanAnnual[2,2]/1000000, col=col[2], lty=2)
# Vertical marker at July.
abline(v=7, col="grey50", lty=2)
########################################
# Same seasonal visualization, restricted to a single store (Store 1).
cat("Визуализация")
library(forecast)
agg<-aggregate(Sales ~ ., data=train[train$Store==1, c("Sales", "month" ,"year")], FUN=sum)
# NOTE(review): shadows base::ts again; units here are still divided
# by 1e6 even though single-store sales are much smaller.
ts<-ts(agg$Sales/1000000, start=2013, frequency=12)
col = rainbow(3)
seasonplot(ts, col=col, year.labels.left = TRUE, pch=19, las=1, main = "Продажи по годам (Store 1)", ylab = "Продажи")
# Drop the (presumably incomplete) third year before drawing means.
meanAnnual<-aggregate(Sales ~ ., data=agg[, c("Sales" ,"year")], FUN=mean)[-3,]
abline(h=meanAnnual[1,2]/1000000, col=col[1], lty=2)
abline(h=meanAnnual[2,2]/1000000, col=col[2], lty=2)
abline(v=7, col="grey50", lty=2)
########################################
# Plot the sales time series for one store.
#
# dt:    data frame with at least Store, Date, Sales, DayOfWeek,
#        StateHoliday, Assortment and StoreType columns
# store: store id to plot
#
# Points are colored by day of week; the StateHoliday factor level
# shifts the plotting symbol. Returns NULL invisibly (the function is
# called for its plotting side effect).
#
# Fix: the original body ended with a bare `return`, which evaluates
# to the base `return` function object and returns THAT -- not NULL.
plotStore <- function(dt, store) {
  dt <- subset(dt, dt$Store == store)
  dt$Date = as.Date(dt$Date, format = "%Y-%m-%d")
  dt <- dt[order(dt$Date),]
  plot(dt$Sales ~ dt$Date,
       col = dt$DayOfWeek,
       # StateHoliday is a factor; its integer level code offsets pch
       pch = (as.integer(dt$StateHoliday)) + 19,
       main = paste("store", store, "  n =", nrow(dt), "  ", "Assortment = ", dt$Assortment[1], ", StoreType = ",  dt$StoreType[1]))
  legend("topleft", legend=levels(factor(dt$DayOfWeek)), text.col=seq_along(levels(factor(dt$DayOfWeek))))
  invisible(NULL)
}
# Plot 20 randomly chosen stores. NOTE(review): sampling from
# length(unique(train$Store)) assumes store ids are contiguous
# 1..N -- TODO confirm against the data.
sapply(sample(length(unique(train$Store)), 20), function(store) {
# sapply(1:length(unique(train$Store)), function(store) {
plotStore(train, store)
})
table(train$DayOfWeek)
###############################################################################
cat("Удаляем лишнее\n")
# Drop columns not usable as model features: Date (already replaced by
# month/year/day) and Customers (not available at prediction time).
train$Date <- NULL
test$Date <- NULL
train$Customers <- NULL
cat("Пытаемся обучить RF")
# First attempt on the full frame; this is expected to fail (the next
# cat() says so) -- presumably due to NAs and/or factor columns with
# too many levels for randomForest -- TODO confirm.
clf <- randomForest(train[,-c(4)], train$Sales, ntree=10, do.trace=TRUE)
cat("С первого раза не получилось...\n")
# Inspect NA counts per column.
colSums(is.na(train))
# Crude imputation: replace every NA with a sentinel value.
train[is.na(train)] <- -99999
test[is.na(test)] <- -99999
head(train,1)
# Refit on a 10k subsample; column 3 is excluded, presumably the
# Sales target -- TODO confirm column order after the merges.
samp <- sample(1:nrow(train), 10000)
clf <- randomForest(train[samp,-c(3)], train$Sales[samp], ntree=10, do.trace=TRUE)
cat("Feature Importance\n")
varImpPlot(clf)
cat("Легко дообучить\n")
# grow() adds 10 more trees to the existing forest.
clf <- grow(clf, 10)
print(clf)
###############################################################################
# Time-based train/validation split inside 2015: Jan-May for training,
# June onward for validation. Earlier years are discarded here.
val <- train[train$year ==  15 & train$month >= 6,]
train <- train[train$year ==  15 & train$month < 6,]
head(train,1)
head(val,1)
# Fit on a 20k subsample of the reduced training set (column 3 is the
# excluded target column, as above).
samp <- sample(1:nrow(train), 20000)
clf <- randomForest(train[samp,-c(3)], train$Sales[samp], ntree=10, do.trace=TRUE)
pred <- predict(clf, val[,-c(3)])
# Root Mean Square Percentage Error between actual and predicted
# values. Rows with actual <= 0 are excluded (the Kaggle metric
# ignores zero-sales days). Returns a named numeric ("RMSPE").
# lev / model are accepted for caret-style compatibility and unused.
rmspe <- function(actual, pred, lev = NULL, model = NULL) {
  keep <- actual > 0
  rel_err <- (actual[keep] - pred[keep]) / actual[keep]
  score <- sqrt(mean(rel_err^2))
  names(score) <- "RMSPE"
  score
}
cat("Первый результат:")
# Validation RMSPE of the subsampled random forest.
rmspe(val$Sales,pred)
# Error vs number of trees.
plot(clf)
###############################################################################
library(caret)
set.seed(825)
# caret summaryFunction computing RMSPE from a data frame with
# `obs` and `pred` columns; observations with obs <= 0 are skipped.
# Returns a named numeric ("RMSPE"). Redefines the earlier rmspe().
rmspe <- function(data, lev = NULL, model = NULL) {
  valid <- data$obs > 0
  pct_err <- (data$obs[valid] - data$pred[valid]) / data$obs[valid]
  c(RMSPE = sqrt(mean(pct_err^2)))
}
# Resampling setup: 2-fold CV repeated twice, scored with the custom
# RMSPE summary function defined above.
fitControl <- trainControl(
## 2-fold CV
method = "repeatedcv",
number = 2,
## repeated 2 times
repeats = 2,
summaryFunction=rmspe)
# Random forest via caret on the 20k subsample; maximize = F because
# lower RMSPE is better.
rfFit1 <- train(Sales ~ ., data = train[samp,],
method = "rf",  ntree=10,
trControl = fitControl,
metric = "RMSPE",
maximize = F,
verbose = T)
rfFit1
trellis.par.set(caretTheme())
# RMSPE profile over the mtry tuning grid.
ggplot(rfFit1)
# Final fit with fixed hyper-parameters (no resampling), then score
# the test set and write the submission file.
fitControl <- trainControl(method = "none")
set.seed(825)
rfFit2 <- train(Sales ~ ., data = train[samp,],
method = "rf", ntree=10,
trControl = fitControl,
verbose = FALSE,
## Only a single model can be passed to the
## function when no resampling is used:
tuneGrid = data.frame(mtry=25),
metric = "RMSPE")
rfFit2
# Sanity check on a few validation rows.
predict(rfFit2, newdata = head(val))
# Column 2 of test is dropped before prediction -- presumably the Id
# column; TODO confirm column order after the merge.
pred <- predict(rfFit2, newdata = test[,-c(2)])
submission <- data.frame(Id=test$Id, Sales=pred)
cat("saving the submission file\n")
# Fix: row.names = FALSE -- the write.csv default adds an extra
# row-number column, which breaks the expected Id,Sales format.
write.csv(submission, "rf1.csv", row.names = FALSE)
###############################################################################
# Same 2x2 repeated-CV setup, reused for GBM.
fitControl <- trainControl(
## 2-fold CV
method = "repeatedcv",
number = 2,
## repeated 2 times
repeats = 2,
summaryFunction=rmspe)
# Tuning grid: 3 depths x 10 tree counts x 2 shrinkage values.
gbmGrid <-  expand.grid(interaction.depth = c(1, 3, 5),
n.trees = (1:10)*50,
shrinkage = c(0.1, 0.15),
n.minobsinnode = 20)
gbmFit1 <- train(Sales ~ ., data = train[samp,],
method = "gbm",
trControl = fitControl,
metric = "RMSPE",
maximize = F,
verbose = T,
tuneGrid = gbmGrid)
gbmFit1
trellis.par.set(caretTheme())
ggplot(gbmFit1)
###############################################################################
# Compare resampled RMSPE distributions of GBM vs RF.
resamps <- resamples(list(GBM = gbmFit1,
RF = rfFit1))
resamps
summary(resamps)
# Leftover scratch assignments (unused below).
a <- 1
b <- 2
# Naive Bayes on the categorical HouseVotes84 data (mlbench package).
library(e1071)
## Categorical data only:
# Fix: the original called install.packages("nlbench") -- a typo'd,
# non-existent package -- and an unconditional install.packages().
# Install mlbench only when it is actually missing.
if (!requireNamespace("mlbench", quietly = TRUE)) {
  install.packages("mlbench")
}
data(HouseVotes84, package = "mlbench")
View(HouseVotes84)
# Train on the first 200 rows, predict the remaining 235.
trainSamples <- 200
model <- naiveBayes(Class ~ ., data = HouseVotes84[1:trainSamples,])
model
predict(model, HouseVotes84[201:435,])
# Class probabilities for the first 10 held-out rows.
predict(model, HouseVotes84[201:435,], type = "raw")[1:10,]
pred <- predict(model, HouseVotes84[201:435,])
# Confusion matrix on the held-out rows.
table(pred, HouseVotes84$Class[201:435])
## Example with numerical data:
data(iris)
iris
# 75/25 split by row indices.
samp <- sample(1:nrow(iris), 0.75*nrow(iris))
# NOTE(review): this formula-interface fit is immediately overwritten
# by the equivalent matrix-interface call on the next line.
m <- naiveBayes(Species ~ ., data = iris[samp,])
m <- naiveBayes(iris[samp,-5], iris[samp,5])
m
View(iris)
library(kknn)
data(iris)
# m is reused here as the row count (it previously held a model).
m <- dim(iris)[1]
# Hold out a third of the rows; the uniform prob vector is redundant
# (sample() is uniform by default).
val <- sample(1:m, size = round(m/3), replace = FALSE,
prob = rep(1/m, m))
iris.learn <- iris[-val,]
iris.valid <- iris[val,]
# Weighted k-NN: Manhattan distance (distance = 1), triangular kernel.
iris.kknn <- kknn(Species~., iris.learn, iris.valid, distance = 1,
kernel = "triangular")
summary(iris.kknn)
fit <- fitted(iris.kknn)
# Confusion matrix on the held-out rows.
table(iris.valid$Species, fit)
# Plot symbols encode the true class; red marks misclassified points.
pcol <- as.character(as.numeric(iris.valid$Species))
pairs(iris.valid[1:4], pch = pcol, col = c("green3", "red")
[(iris.valid$Species != fit)+1])
# k-NN on the ionosphere data shipped with kknn.
data(ionosphere)
View(ionosphere)
# First 200 rows train, the rest validate.
ionosphere.learn <- ionosphere[1:200,]
ionosphere.valid <- ionosphere[-c(1:200),]
fit.kknn <- kknn(class ~ ., ionosphere.learn, ionosphere.valid)
table(ionosphere.valid$class, fit.kknn$fit)
# Leave-one-out selection of k (up to 15) and kernel, Manhattan
# distance.
(fit.train1 <- train.kknn(class ~ ., ionosphere.learn, kmax = 15,
kernel = c("triangular", "rectangular", "epanechnikov", "optimal"), distance = 1))
plot(fit.train1)
table(predict(fit.train1, ionosphere.valid), ionosphere.valid$class)
# Same search with Euclidean distance.
(fit.train2 <- train.kknn(class ~ ., ionosphere.learn, kmax = 15,
kernel = c("triangular", "rectangular", "epanechnikov", "optimal"), distance = 2))
plot(fit.train2)
# Simple linear regression on Galton's height data.
# NOTE(review): fetched over plain http from a third-party host; the
# URL may no longer resolve -- consider vendoring the CSV.
galton <- read.csv("http://blog.yhathq.com/static/misc/galton.csv",
header=TRUE, stringsAsFactors=FALSE)
View(galton)
summary(galton)
head(galton)
# child height ~ parent height.
fit <- lm(child ~ parent, data=galton)
fit
summary(fit)
library(reshape2)
# WorldPhones matrix -> long format: one row per (year, continent).
phones <- melt(WorldPhones)
View(phones)
names(phones) <- c("year", "continent", "n_phones")
head(phones)
# Linear model with a numeric year trend and continent offsets.
fit <- lm(n_phones ~ year + continent, data=phones)
summary(fit)
library(rpart)
# Fix: the original evaluated `Kyphosis` (capital K), which is not an
# object -- the rpart dataset is lowercase `kyphosis`. `Kyphosis` is
# only a column name inside that frame.
kyphosis
# grow tree
fit <- rpart(Kyphosis ~ Age + Number + Start,
method="class", data=kyphosis)
summary(fit) # detailed summary of splits
plot(fit, uniform=TRUE,
main="Classification Tree for Kyphosis")
text(fit, use.n=TRUE, all=TRUE, cex=.8)
kyphosis
# grow tree
# Regression tree (method="anova") on the cu.summary car data that
# ships with rpart.
fit <- rpart(Mileage~Price + Country + Reliability + Type,
method="anova", data=cu.summary)
summary(fit) # detailed summary of splits
# plot tree
# NOTE(review): "Milege" is a typo in the plot title.
plot(fit, uniform=TRUE,
main="Regression Tree for Milege ")
text(fit, use.n=TRUE, all=TRUE, cex=.8)
# Fix: the original pinned lib.loc to a user-specific path
# ("~/R/x86_64-pc-linux-gnu-library/3.2"), which breaks on any other
# machine; load from the default library paths instead.
# NOTE(review): "party" is loaded here but not used below.
library("party")
library("randomForest")
## Classification:
##data(iris)
set.seed(71)
iris.rf <- randomForest(Species ~ ., data=iris, importance=TRUE,
proximity=TRUE)
print(iris.rf)
## Look at variable importance:
round(importance(iris.rf), 2)
# Regression forest on swiss: drop Catholic, add a binary indicator.
(swiss.rf <- randomForest(sqrt(Fertility) ~ . - Catholic + I(Catholic < 50),
data=swiss))
library(gbm)
# Simulated regression data: 1000 rows with numeric and factor
# predictors; X3 is an ordered factor that drives the mean via mu.
# (RNG call order matters for reproducibility -- do not reorder.)
N <- 1000
X1 <- runif(N)
X2 <- 2*runif(N)
X3 <- ordered(sample(letters[1:4],N,replace=TRUE),levels=letters[4:1])
X4 <- factor(sample(letters[1:6],N,replace=TRUE))
X5 <- factor(sample(letters[1:3],N,replace=TRUE))
X6 <- 3*runif(N)
mu <- c(-1,0,1,2)[as.numeric(X3)]
SNR <- 10 # signal-to-noise ratio
Y <- X1**1.5 + 2 * (X2**.5) + mu
# Noise variance chosen so var(signal)/var(noise) = SNR.
sigma <- sqrt(var(Y)/SNR)
Y <- Y + rnorm(N,0,sigma)
# introduce some missing values
X1[sample(1:N,size=500)] <- NA
X4[sample(1:N,size=300)] <- NA
data <- data.frame(Y=Y,X1=X1,X2=X2,X3=X3,X4=X4,X5=X5,X6=X6)
head(data)
# Duplicate of the simulation above (same RNG call order), followed by
# an actual gbm fit and performance diagnostics.
library(gbm)
N <- 1000
X1 <- runif(N)
X2 <- 2*runif(N)
X3 <- ordered(sample(letters[1:4],N,replace=TRUE),levels=letters[4:1])
X4 <- factor(sample(letters[1:6],N,replace=TRUE))
X5 <- factor(sample(letters[1:3],N,replace=TRUE))
X6 <- 3*runif(N)
mu <- c(-1,0,1,2)[as.numeric(X3)]
SNR <- 10 # signal-to-noise ratio
Y <- X1**1.5 + 2 * (X2**.5) + mu
sigma <- sqrt(var(Y)/SNR)
Y <- Y + rnorm(N,0,sigma)
# introduce some missing values
X1[sample(1:N,size=500)] <- NA
X4[sample(1:N,size=300)] <- NA
data <- data.frame(Y=Y,X1=X1,X2=X2,X3=X3,X4=X4,X5=X5,X6=X6)
# fit initial model
gbm1 <-
gbm(Y~X1+X2+X3+X4+X5+X6,         # formula
data=data,                   # dataset
distribution="gaussian",     # see the help for other choices
n.trees=1000,                # number of trees
shrinkage=0.05,              # shrinkage or learning rate,
# 0.001 to 0.1 usually work
interaction.depth=3,         # 1: additive model, 2: two-way interactions, etc.
bag.fraction = 0.5,          # subsampling fraction, 0.5 is probably best
train.fraction = 0.5,        # fraction of data for training,
# first train.fraction*N used for training
n.minobsinnode = 10,         # minimum total weight needed in each node
cv.folds = 3,                # do 3-fold cross-validation
keep.data=TRUE,              # keep a copy of the dataset with the object
verbose=FALSE,               # don't print out progress
n.cores=1)                   # use only a single core (detecting #cores is
# error-prone, so avoided here)
# check performance using an out-of-bag estimator
# OOB underestimates the optimal number of iterations
best.iter <- gbm.perf(gbm1,method="OOB")
print(best.iter)
# check performance using a 50% heldout test set
best.iter <- gbm.perf(gbm1,method="test")
print(best.iter)
# check performance using 3-fold cross-validation (cv.folds = 3 above;
# the original comment incorrectly said 5-fold)
best.iter <- gbm.perf(gbm1,method="cv")
print(best.iter)
summary(gbm1,n.trees=1)         # based on the first tree
summary(gbm1,n.trees=best.iter) # based on the estimated best number of trees
# Second copy of the Rossmann pipeline: load, merge store metadata,
# drop closed-store rows.
library(randomForest)
set.seed(616)
cat("Чтение обучающей и тестовой выборок\n")
train <- read.csv("../input/train.csv")
test  <- read.csv("../input/test.csv")
store <- read.csv("../input/store.csv")
cat("Что же у нас есть?\n")
head(train)
head(test)
head(store)
# merge() joins on the shared column(s), presumably Store -- confirm.
train <- merge(train, store)
test <- merge(test, store)
cat("Имена переменных\n")
head(train)
head(test)
###############################################################################
cat("Если магазин закрыт...\n")
# Distribution of Sales on days the store was closed.
table(train$Sales[train$Open==0])
sum(test$Open==0)
sum(test$Open==0, na.rm = T)
table(test$Open)
table(test$Open, useNA = "always")
# Rows with unknown Open status in the test set.
test[is.na(test$Open),]
table(train$Open, useNA = "always")
cat("Оставляем в тренировочной выборке только объекты, когда магазин был открыт\n")
# Keep only open-store days; the '1' comparison relies on coercion.
train <- train[ which(train$Open=='1'),]
table(train$Date)
# Integer calendar features from Date: month, two-digit year, day.
train$month <- as.integer(format(as.Date(train$Date), "%m"))
train$year <- as.integer(format(as.Date(train$Date), "%y"))
train$day <- as.integer(format(as.Date(train$Date), "%d"))
head(train)
test$month <- as.integer(format(as.Date(test$Date), "%m"))
test$year <- as.integer(format(as.Date(test$Date), "%y"))
test$day <- as.integer(format(as.Date(test$Date), "%d"))
head(test)
# Dates covered by the test set.
sort(unique(test$Date))
print(sort(unique(test$Date)))
summary(train)
# Frequency plot of the target variable.
plot(table(train$Sales))
# Four seasonal-plot variants: monthly totals, medians, means, and
# Store-1 totals. Each shadows base::ts with a ts object.
cat("Визуализация")
library(forecast)
agg<-aggregate(Sales ~ ., data=train[, c("Sales", "month" ,"year")], FUN=sum)
ts<-ts(agg$Sales/1000000, start=2013, frequency=12)
col = rainbow(3)
seasonplot(ts, col=col, year.labels.left = TRUE, pch=19, las=1, main = "Продажи по годам", ylab = "Суммарные продажи(млн)")
# [-3,] presumably drops the incomplete 2015 row -- TODO confirm.
meanAnnual<-aggregate(Sales ~ ., data=agg[, c("Sales" ,"year")], FUN=mean)[-3,]
abline(h=meanAnnual[1,2]/1000000, col=col[1], lty=2)
abline(h=meanAnnual[2,2]/1000000, col=col[2], lty=2)
abline(v=7, col="grey50", lty=2)
cat("Визуализация")
library(forecast)
# Same plot with monthly MEDIAN sales. NOTE(review): the axis label
# still claims totals in millions, which is misleading here.
agg<-aggregate(Sales ~ ., data=train[, c("Sales", "month" ,"year")], FUN=median)
ts<-ts(agg$Sales/1000000, start=2013, frequency=12)
col = rainbow(3)
seasonplot(ts, col=col, year.labels.left = TRUE, pch=19, las=1, main = "Продажи по годам", ylab = "Суммарные продажи(млн)")
meanAnnual<-aggregate(Sales ~ ., data=agg[, c("Sales" ,"year")], FUN=mean)[-3,]
abline(h=meanAnnual[1,2]/1000000, col=col[1], lty=2)
abline(h=meanAnnual[2,2]/1000000, col=col[2], lty=2)
abline(v=7, col="grey50", lty=2)
cat("Визуализация")
library(forecast)
# Same plot with monthly MEAN sales (label caveat as above).
agg<-aggregate(Sales ~ ., data=train[, c("Sales", "month" ,"year")], FUN=mean)
ts<-ts(agg$Sales/1000000, start=2013, frequency=12)
col = rainbow(3)
seasonplot(ts, col=col, year.labels.left = TRUE, pch=19, las=1, main = "Продажи по годам", ylab = "Суммарные продажи(млн)")
meanAnnual<-aggregate(Sales ~ ., data=agg[, c("Sales" ,"year")], FUN=mean)[-3,]
abline(h=meanAnnual[1,2]/1000000, col=col[1], lty=2)
abline(h=meanAnnual[2,2]/1000000, col=col[2], lty=2)
abline(v=7, col="grey50", lty=2)
cat("Визуализация")
library(forecast)
# Totals restricted to Store 1.
agg<-aggregate(Sales ~ ., data=train[train$Store==1, c("Sales", "month" ,"year")], FUN=sum)
ts<-ts(agg$Sales/1000000, start=2013, frequency=12)
col = rainbow(3)
seasonplot(ts, col=col, year.labels.left = TRUE, pch=19, las=1, main = "Продажи по годам (Store 1)", ylab = "Продажи")
meanAnnual<-aggregate(Sales ~ ., data=agg[, c("Sales" ,"year")], FUN=mean)[-3,]
abline(h=meanAnnual[1,2]/1000000, col=col[1], lty=2)
abline(h=meanAnnual[2,2]/1000000, col=col[2], lty=2)
abline(v=7, col="grey50", lty=2)
# Plot the sales time series for one store.
#
# dt:    data frame with at least Store, Date, Sales, DayOfWeek,
#        StateHoliday, Assortment and StoreType columns
# store: store id to plot
#
# Points are colored by day of week; the StateHoliday factor level
# shifts the plotting symbol. Returns NULL invisibly (the function is
# called for its plotting side effect).
#
# Fix: the original body ended with a bare `return`, which evaluates
# to the base `return` function object and returns THAT -- not NULL.
plotStore <- function(dt, store) {
  dt <- subset(dt, dt$Store == store)
  dt$Date = as.Date(dt$Date, format = "%Y-%m-%d")
  dt <- dt[order(dt$Date),]
  plot(dt$Sales ~ dt$Date,
       col = dt$DayOfWeek,
       # StateHoliday is a factor; its integer level code offsets pch
       pch = (as.integer(dt$StateHoliday)) + 19,
       main = paste("store", store, "  n =", nrow(dt), "  ", "Assortment = ", dt$Assortment[1], ", StoreType = ",  dt$StoreType[1]))
  legend("topleft", legend=levels(factor(dt$DayOfWeek)), text.col=seq_along(levels(factor(dt$DayOfWeek))))
  invisible(NULL)
}
# Plot 20 randomly chosen stores (assumes contiguous store ids 1..N
# -- TODO confirm).
sapply(sample(length(unique(train$Store)), 20), function(store) {
# sapply(1:length(unique(train$Store)), function(store) {
plotStore(train, store)
})
table(train$DayOfWeek)
cat("Удаляем лишнее\n")
# Drop Date (replaced by month/year/day) and Customers (unavailable
# at prediction time).
train$Date <- NULL
test$Date <- NULL
train$Customers <- NULL
cat("Пытаемся обучить RF")
# Expected to fail on the full frame (see the next cat()); presumably
# NAs / factor-level limits -- TODO confirm.
clf <- randomForest(train[,-c(4)], train$Sales, ntree=10, do.trace=TRUE)
cat("С первого раза не получилось...\n")
colSums(is.na(train))
# Sentinel-value NA imputation.
train[is.na(train)] <- -99999
test[is.na(test)] <- -99999
head(train,1)
# Fit on a 10k subsample; column 3 is presumably the Sales target.
samp <- sample(1:nrow(train), 10000)
clf <- randomForest(train[samp,-c(3)], train$Sales[samp], ntree=10, do.trace=TRUE)
cat("Feature Importance\n")
varImpPlot(clf)
cat("Легко дообучить\n")
# grow() adds 10 more trees to the fitted forest.
clf <- grow(clf, 10)
print(clf)
# Time-based split within 2015: Jan-May train, Jun+ validation;
# earlier years are discarded.
val <- train[train$year ==  15 & train$month >= 6,]
train <- train[train$year ==  15 & train$month < 6,]
head(train,1)
head(val,1)
samp <- sample(1:nrow(train), 20000)
clf <- randomForest(train[samp,-c(3)], train$Sales[samp], ntree=10, do.trace=TRUE)
pred <- predict(clf, val[,-c(3)])
# Root Mean Square Percentage Error; entries with actual <= 0 are
# excluded. Returns a named numeric ("RMSPE"). lev / model exist for
# caret-style compatibility and are unused.
rmspe <- function(actual, pred, lev = NULL, model = NULL) {
  keep <- actual > 0
  rel_err <- (actual[keep] - pred[keep]) / actual[keep]
  score <- sqrt(mean(rel_err^2))
  names(score) <- "RMSPE"
  score
}
cat("Первый результат:")
# Validation RMSPE of the subsampled forest.
rmspe(val$Sales,pred)
plot(clf)
library(caret)
set.seed(825)
# caret summaryFunction: RMSPE over a data frame with `obs` and
# `pred` columns, skipping rows with obs <= 0. Returns a named
# numeric ("RMSPE"). Redefines the earlier rmspe().
rmspe <- function(data, lev = NULL, model = NULL) {
  valid <- data$obs > 0
  pct_err <- (data$obs[valid] - data$pred[valid]) / data$obs[valid]
  c(RMSPE = sqrt(mean(pct_err^2)))
}
# 2-fold CV repeated twice, scored with the custom RMSPE summary.
fitControl <- trainControl(
## 2-fold CV
method = "repeatedcv",
number = 2,
## repeated 2 times
repeats = 2,
summaryFunction=rmspe)
# caret random forest; maximize = F because lower RMSPE is better.
rfFit1 <- train(Sales ~ ., data = train[samp,],
method = "rf",  ntree=10,
trControl = fitControl,
metric = "RMSPE",
maximize = F,
verbose = T)
rfFit1
trellis.par.set(caretTheme())
ggplot(rfFit1)
# Final fit with fixed hyper-parameters (no resampling), then score
# the test set and write the submission file.
fitControl <- trainControl(method = "none")
set.seed(825)
rfFit2 <- train(Sales ~ ., data = train[samp,],
method = "rf", ntree=10,
trControl = fitControl,
verbose = FALSE,
## Only a single model can be passed to the
## function when no resampling is used:
tuneGrid = data.frame(mtry=25),
metric = "RMSPE")
rfFit2
# Sanity check on a few validation rows.
predict(rfFit2, newdata = head(val))
# Column 2 of test is dropped before prediction -- presumably the Id
# column; TODO confirm column order after the merge.
pred <- predict(rfFit2, newdata = test[,-c(2)])
submission <- data.frame(Id=test$Id, Sales=pred)
cat("saving the submission file\n")
# Fix: row.names = FALSE -- the write.csv default adds an extra
# row-number column, which breaks the expected Id,Sales format.
write.csv(submission, "rf1.csv", row.names = FALSE)
# GBM with the same 2x2 repeated-CV setup.
fitControl <- trainControl(
## 2-fold CV
method = "repeatedcv",
number = 2,
## repeated 2 times
repeats = 2,
summaryFunction=rmspe)
# Tuning grid: note n.trees goes up to 250 here ((1:5)*50), unlike
# the earlier copy which used (1:10)*50.
gbmGrid <-  expand.grid(interaction.depth = c(1, 3, 5),
n.trees = (1:5)*50,
shrinkage = c(0.1, 0.15),
n.minobsinnode = 20)
gbmFit1 <- train(Sales ~ ., data = train[samp,],
method = "gbm",
trControl = fitControl,
metric = "RMSPE",
maximize = F,
verbose = T,
tuneGrid = gbmGrid)
gbmFit1
trellis.par.set(caretTheme())
ggplot(gbmFit1)
# Compare resampled RMSPE distributions of GBM vs RF.
resamps <- resamples(list(GBM = gbmFit1,
RF = rfFit1))
resamps
summary(resamps)
